Take-home Exercise 3

In this Take-home Exercise, I will explore the economic of the city of Engagement, Ohio USA.

Che Xuan https://www.linkedin.com/in/jacob-che-xuan-b646a9123/
2022-05-15

Task

Challenge 3: Economic considers the financial health of the city. Over time, are businesses growing or shrinking? How are people changing jobs? Are standards of living improving or declining over time?

Consider the financial status of Engagement’s businesses and residents, and use visual analytic techniques to address these questions.

Overview

In this take-home exercise, appropriate static and interactive statistical graphics methods are used to reveal the economic of the city of Engagement, Ohio USA while addressing the questions stated in the Task section.

The data are processed by using appropriate tidyverse family of packages and the statistical graphics are prepared using ggplot2 and its extensions.

Sketch of Proposed Design

The picture below shows a sketch of the initial design proposed.

Installing & Launching R Packages

Before we get started, it is important for us to ensure that the required R packages have been installed. If yes, we will load the R packages. If they have yet to be installed, we will install the R packages and load them onto R environment.

The chunk code below will do the trick.

packages = c('tidyverse', 'ggdist', 'ggridges', 'patchwork', 'ggthemes', 'lubridate', 'ggiraph', 'gganimate', 'plotly', 'DT', 'crosstalk')

for(p in packages){
  if(!require(p, character.only = T)){
    install.packages(p)
  }
  library(p, character.only = T)
}

library(trelliscopejs)

Importing Data

The code chunk below imports Participants.csv from the data folder into R by using read_csv() of readr package and save it as an tibble data frame called participants.

travel <- read_csv("rawdata/TravelJournal.csv")
employers <- read_csv("rawdata/Employers.csv")

summary(travel)
 participantId    travelStartTime               travelStartLocationId
 Min.   :   0.0   Min.   :2022-03-01 05:00:00   Min.   :   1         
 1st Qu.: 221.0   1st Qu.:2022-06-10 17:35:00   1st Qu.: 449         
 Median : 464.0   Median :2022-10-03 18:40:00   Median : 913         
 Mean   : 480.5   Mean   :2022-10-05 05:21:39   Mean   :1016         
 3rd Qu.: 726.0   3rd Qu.:2023-01-28 06:20:00   3rd Qu.:1358         
 Max.   :1010.0   Max.   :2023-05-24 23:35:00   Max.   :1805         
                                                NA's   :1043         
 travelEndTime                 travelEndLocationId   purpose         
 Min.   :2022-03-01 05:35:00   Min.   :   0        Length:2099656    
 1st Qu.:2022-06-10 18:10:00   1st Qu.: 449        Class :character  
 Median :2022-10-03 19:00:00   Median : 910        Mode  :character  
 Mean   :2022-10-05 05:46:07   Mean   :1015                          
 3rd Qu.:2023-01-28 06:45:00   3rd Qu.:1358                          
 Max.   :2023-05-24 23:55:00   Max.   :1805                          
                                                                     
  checkInTime                   checkOutTime                
 Min.   :2022-03-01 05:35:00   Min.   :2022-03-01 06:00:00  
 1st Qu.:2022-06-10 18:10:00   1st Qu.:2022-06-10 21:40:00  
 Median :2022-10-03 19:00:00   Median :2022-10-03 22:47:30  
 Mean   :2022-10-05 05:46:07   Mean   :2022-10-05 09:53:15  
 3rd Qu.:2023-01-28 06:45:00   3rd Qu.:2023-01-28 08:30:00  
 Max.   :2023-05-24 23:55:00   Max.   :2023-05-25 00:05:00  
                                                            
 startingBalance    endingBalance     
 Min.   :  -681.6   Min.   :  -640.7  
 1st Qu.:  5077.8   1st Qu.:  5086.4  
 Median : 12006.9   Median : 12019.5  
 Mean   : 19573.7   Mean   : 19590.8  
 3rd Qu.: 25972.4   3rd Qu.: 25992.5  
 Max.   :240494.7   Max.   :240838.8  
                                      
summary(employers)
   employerId     location           buildingId    
 Min.   : 379   Length:253         Min.   :   3.0  
 1st Qu.: 829   Class :character   1st Qu.: 261.0  
 Median :1279   Mode  :character   Median : 486.0  
 Mean   :1089                      Mean   : 517.8  
 3rd Qu.:1734                      3rd Qu.: 782.0  
 Max.   :1797                      Max.   :1041.0  

Data Wrangling

travel$month <- factor(month(travel$`travelEndTime`), 
                    levels=1:12, 
                    labels=month.abb, 
                    ordered=TRUE)
travel$year <- year(travel$`travelEndTime`)
travel$year_month <- format(as.Date(travel$`travelEndTime`), "%Y-%m")
travel$day <- day(travel$`travelEndTime`)
travel$date <- date(travel$`travelEndTime`)
travel$wkday <- weekdays(travel$`travelEndTime`)

Vizualization

First we examine the overall trend of travel for all purpose

ggplot(data=travel, 
       aes(x = travelStartTime, 
           fill = purpose)) +
    geom_histogram(bins=15,
                   color="black") +
    scale_y_continuous(NULL,
                     breaks = NULL)

Next we use trellis plot to examine changes over time. However since the facet function of ggplot2 is not useful for visualizing large data (i.e. 1000 participants), we will use trelliscopejs instead.

travel_count <- travel %>%
  select(year_month, purpose, participantId) %>%
  group_by(year_month, purpose) %>%
  summarise(count = n()) %>%
  ungroup()
qplot(year_month, count, data = travel_count) +
  facet_wrap(~ purpose) +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())

Zooming into Work pattern

travel_to_work <- travel %>%
  filter(purpose == 'Work/Home Commute') %>%
  inner_join(y=employers, by = c("travelEndLocationId" = "employerId")) %>%
  select(participantId, travelEndTime, year, year_month, month, day, date, wkday, travelEndLocationId, purpose, location, buildingId) %>%
  rename('employerId' = 'travelEndLocationId')

Rename a value for simplicity

travel_to_work$purpose <- sub('Work/Home Commute', 
                              'Work',
                              travel_to_work$purpose)

doing a count:

travel_to_work_count <- travel_to_work %>%
  group_by(year_month, day) %>%
  summarise(count = n()) %>%
  ungroup()

by month analysis

ggplot() + 
  geom_line(data=travel_to_work_count,
            aes(x=day, 
                y=count, 
                group=year_month), 
            colour="black") +
  facet_grid(~year_month) +
  labs(axis.text.x = element_blank()) +
  xlab("") +
  ylab("No. of Visitors")

qplot(day, count, data = travel_to_work_count) +
  facet_wrap(~ year_month) +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())

qplot(day, count, data = travel_to_work_count) +
  facet_trelliscope(~ year_month, nrow = 2, ncol = 4, width = 600,
                    path = "trellis/",
                    self_contained=TRUE) +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())

Interactive Bar graph by month

travel_to_work_by_month <- travel_to_work %>%
  group_by(year_month, purpose) %>%
  summarise(count = n()) %>%
  ungroup()
travel_to_work_by_month$tooltip <- c(paste0(
  "Purpose = ", travel_to_work_by_month$purpose,
  "\n Count = ", travel_to_work_by_month$count))

p <- ggplot(data=travel_to_work_by_month, 
       aes(x = year_month, y = count)) +
  geom_bar_interactive(aes(tooltip = travel_to_work_by_month$tooltip,
                           data_id = year_month),
                       stat="identity")

girafe(
  ggobj = p,
  width_svg = 12,
  height_svg = 12*0.618
)
travel_to_work_daily_count <- travel_to_work %>%
  group_by(employerId, year_month, day, date, wkday) %>%
  summarise(count = n()) %>%
  ungroup()
travel_to_work_monthly_change <- travel_to_work_daily_count %>%
  group_by(employerId, year_month) %>%
  summarise(monthly_employees = max(count)) %>%
  mutate(mom_change = coalesce(monthly_employees - lead(monthly_employees),0),
         mom_turnover_rate = coalesce((monthly_employees - lead(monthly_employees))/monthly_employees,0)) %>%
  ungroup()
travel_to_work_mom <- travel_to_work_monthly_change %>%
  group_by(year_month) %>%
  summarise(avg_turnover = mean(mom_turnover_rate)) %>%
  ungroup()
travel_to_work_mom$tooltip <- c(paste0(
  "MOM Turnover % =", round(travel_to_work_mom$avg_turnover*100,1), '%'))

p2 <- ggplot(data=travel_to_work_mom, 
       aes(x = year_month, y = avg_turnover)) +
  geom_bar_interactive(aes(tooltip = travel_to_work_mom$tooltip,
                           data_id = year_month),
                       stat="identity")

girafe(code = print(p / p2),
       width_svg = 12,
       height_svg = 12,
       options = list(
         opts_hover(css = "fill: #202020;"),
         opts_hover_inv(css = "opacity:0.2;")
         )
       )

Now look into by wkday

wkday_levels <- c('Saturday', 'Friday', 
                  'Thursday', 'Wednesday', 
                  'Tuesday', 'Monday', 
                  'Sunday')

travel_to_work_by_day <- travel_to_work_daily_count %>%
  group_by(date, day, wkday, year_month) %>%
  summarise(daily_employees = sum(count)) %>%
  ungroup() %>%
  mutate(wkday = factor(
    wkday, levels = wkday_levels))
p3 <- ggplot(travel_to_work_by_day, 
       aes(year_month, 
           wkday, 
           fill = daily_employees)) + 
geom_tile(color = "white", 
          size = 0.1) + 
theme_tufte(base_family = "Helvetica") + 
coord_equal() +
scale_fill_gradient(name = "# of attacks",
                    low = "sky blue", 
                    high = "dark blue")

ggplotly(p3)
d <- highlight_key(travel_to_work_by_day)

p4 <- ggplot(d, aes(date, daily_employees)) +
  geom_line()

gg <- highlight(ggplotly(p4),
                "plotly_selected")
crosstalk::bscols(gg,
                  DT::datatable(d))